
/* 
Create a pik-year spine, then pik-year work history, then worker characteristics, 
then pik-year wind plant exposure (later), then merge. 

[] brackets indicate redacted internal census variable name or directory that can't be disclosed. 

*/

libname ehf '[directory location of employment history files]';
libname indic '[directory location of national indicators file]';
libname controls '[directory location of individual characteristics file]';

libname interm '/projects/users/########/Snapshot2022/IntermediateData';
libname tempdat '/projects/users/########/Snapshot2022/IntermediateData/TempData';
libname res21 '[directory location of worker residence files]';



/* 
First get the piks for anyone who ever lived in our 23 states. 
*/
%macro pikspine;
  %let stfip = 01 04 05 06 08 10 11 17 18 19 20 23 24 29 30 31 32 35 38 40 47 51 56;
	*stcodes = al ar az ca co dc de ia il in ks md me mo mt nd ne nm nv ok tn va wy;
  %local I next_fip;
  %let I = 1;
  %do %while (%scan(&stfip, &I) ne );
    %let next_fip = %scan(&stfip, &I);
	*GET ALL WORKER PIKS EVER LIVED IN A GIVEN STATE;
      *temporary file of all residence piks, for 2000 or later;
	DATA tempdat.icfcpr_&next_fip;
	  SET res21.[residence file 1] (KEEP = [geocode] pik [year]);
	  IF SUBSTR([geocode],1,2) = "&next_fip" AND [year] >=2000;
	  DROP [geocode] [year];
	RUN;
	DATA tempdat.icfrcf_&next_fip;
	  SET res21.[residence file 2] (KEEP = [county id] pik [year]);
	  IF SUBSTR([county id],1,2) = "&next_fip" AND [year] >=2000;
	  DROP [county id] [year];
	RUN;
	proc sort data=tempdat.icfcpr_&next_fip nodupkey;
	  by pik;
	run;
	proc sort data=tempdat.icfrcf_&next_fip nodupkey;
	  by pik;
	run;
	proc append base = tempdat.icfcpr_&next_fip data=tempdat.icfrcf_&next_fip;
	run;
	proc sort data=tempdat.icfcpr_&next_fip nodupkey;
	  by pik;
	run;
	proc append base = interm.pikspine data=tempdat.icfcpr_&next_fip;
	run;
	proc datasets library = tempdat nolist;
	  delete icfcpr_&next_fip icfrcf_&next_fip;
	run; quit;
    %let I = %eval(&I + 1);
  %end;
%mend;

%pikspine;

proc sort data=interm.pikspine nodupkey;
  by pik;
run;

/* 
Now get their addresses in each year. 
	- need to deal with years had duplicate piks, different addresses in [residence file 1]. 
*/
proc sql;
  create table tempdat.pikyearspine_cpr as select
    a.*, b.pik, b.[year], b.[geocode], b.[latitude], b.[longitude], b.[quality flag for lat/lon]
  from interm.pikspine as a 
    left join res21.[residence file 1] as b
    on a.pik = b.pik where [year]>=2000 order by b.pik, b.[year];
quit;
 
proc sort data = tempdat.pikyearspine_cpr nodupkey;
  by pik [year] [geocode] [latitude] [longitude] [quality flag for lat/lon];
run;

proc sql;
  create table tempdat.pikyearspine_cpr as select
    a.*, b.pik, b.[year], b.[geocode]
    , b.[latitude], b.[longitude], b.[quality flag for lat/lon], b.dropflag
  from tempdat.pikyearspine_cpr as a 
    left join tempdat.dups[years with dups] as b
    on a.pik = b.pik and a.[year] = b.[year] and a.[geocode] = b.[geocode] and
	a.[latitude] = b.[latitude] and a.[longitude] = b.[longitude] and
	a.[quality flag for lat/lon] = b.[quality flag for lat/lon]
    order by a.pik, a.[year];
quit;

data tempdat.pikyearspine_cpr;
  set tempdat.pikyearspine_cpr;
  if dropflag = 1 then delete;
  drop dropflag;
  [county id] = SUBSTR([geocode],1,5);
run;
data tempdat.pikyearspine_cpr;
  set tempdat.pikyearspine_cpr;
  drop [geocode];
run;

proc sort data = tempdat.pikyearspine_cpr nodupkey;
  by pik [year];
run;


proc sql;
  create table tempdat.pikyearspine_rcf as select
    a.*, b.pik, b.[year], b.[county id], b.[latitude], b.[longitude]
  from interm.pikspine as a 
    left join res21.[residence file 2] as b
    on a.pik = b.pik where [year]>=2000 order by b.pik, b.[year];
quit;

proc append base = interm.pikyearspine data=tempdat.pikyearspine_cpr;
run;
proc append base = interm.pikyearspine data=tempdat.pikyearspine_rcf;
run;

data interm.pikyearspine;
  set interm.pikyearspine;
  rename [year] = year;
run;

proc sort data = interm.pikyearspine nodupkey;
  by pik year;
run;

proc datasets library = tempdat nolist;
  delete pikyearspine_cpr pikyearspine_rcf;
run; quit;


/* 
Now get the pik-year WORK HISTORY for anyone who ever worked in our states
*/
%macro pikworkhistory;
  %let stcodes = al ar az ca co dc de ia il in ks md me mo mt nd ne nm nv ok tn va wy;
  %local I next_st;
  %let I = 1;
  %do %while (%scan(&stcodes, &I) ne );
    %let next_st = %scan(&stcodes, &I);
      * GET WORKER HISTORIES FOR A STATE;
      *temporary file of all ehf variables, for 2000 or later;
      libname ehf&next_st "[directory with state-level EHF files, index states by &next_st.]";
      DATA tempdat.ehf_&next_st._var;
	SET ehf&next_st..ehf_&next_st (KEEP = year pik [Q1 earnings] [Q2 earnings] [Q3 earnings] [Q4 earnings] [annual earnings] sein seinunit);
	WHERE year>= 2000;
      RUN;
      
      proc sql; 
	create table tempdat.ehf_&next_st._var as
	select pik, year, 
	sum([annual earnings]) as [annual earnings],
	sum([Q1 earnings]) as [Q1 earnings], 
	sum([Q2 earnings]) as [Q2 earnings], 
	sum([Q3 earnings]) as [Q3 earnings], 
	sum([Q4 earnings]) as [Q4 earnings] 
	from tempdat.ehf_&next_st._var
	group by pik, year
	order by pik, year;
      quit;

      proc append base = tempdat.ehf_icf_sts
	data = tempdat.ehf_&next_st._var;
      run;

      proc datasets library = tempdat nolist;
	delete ehf_&next_st._var;
      run; quit;
    %let I = %eval(&I + 1);
  %end;
%mend;

%pikworkhistory;

* COMBINE JOBS FROM MULTIPLE STATES;
proc sql; 
  create table tempdat.ehf_icf_sts as
  select pik, year, 
  sum([annual earnings]) as [annual earnings], 
  sum([Q1 earnings]) as [Q1 earnings], 
  sum([Q2 earnings]) as [Q2 earnings], 
  sum([Q3 earnings]) as [Q3 earnings], 
  sum([Q4 earnings]) as [Q4 earnings]
  from tempdat.ehf_icf_sts
  group by pik, year
  order by pik, year;
quit;


* MERGE PIKYEARSPINE TO WORK HISTORY; 
proc sql;
  create table interm.ehf_icf_spine as
  select a.*, b.*
  from interm.pikyearspine as a
    left join tempdat.ehf_icf_sts as b
    on a.pik = b.pik and a.year = b.year
  order by a.pik, a.year;
quit;


* CALCULATE UNEMPLOYMENT FROM 23 STATES EHF FILES;
* two definitions: zero earnings or earnings below half time minimum wage;
data interm.ehf_icf_spine;
  set interm.ehf_icf_spine;
  if [Q1 earnings] = 0 or missing([Q1 earnings]) then unemp1 = 1; else unemp1 = 0;
  if [Q2 earnings] = 0 or missing([Q2 earnings]) then unemp2 = 1; else unemp2 = 0;
  if [Q3 earnings] = 0 or missing([Q3 earnings]) then unemp3 = 1; else unemp3 = 0;
  if [Q4 earnings] = 0 or missing([Q4 earnings]) then unemp4 = 1; else unemp4 = 0;
  unemp_instate = (unemp1 + unemp2 + unemp3 + unemp4)/4;
  if [Q1 earnings] < 1400 or missing([Q1 earnings]) then min_unemp1 = 1; else min_unemp1 = 0;
  if [Q2 earnings] < 1400 or missing([Q2 earnings]) then min_unemp2 = 1; else min_unemp2 = 0;
  if [Q3 earnings] < 1400 or missing([Q3 earnings]) then min_unemp3 = 1; else min_unemp3 = 0;
  if [Q4 earnings] < 1400 or missing([Q4 earnings]) then min_unemp4 = 1; else min_unemp4 = 0;
  min_unemp_instate = (min_unemp1 + min_unemp2 + min_unemp3 + min_unemp4)/4;
run;
data interm.ehf_icf_spine;
  set interm.ehf_icf_spine (drop = [Q1 earnings] [Q2 earnings] [Q3 earnings] [Q4 earnings] unemp1 unemp2 unemp3 unemp4 min_unemp1 min_unemp2 min_unemp3 min_unemp4);
run;


* MERGE TO NATIONAL INDICATORS FILE;
* (DON'T DO THIS UNTIL MERGED TO PIKYEARSPINE);
proc sql; 
  create table interm.ehf_icf_spine as
  select a.*, b.*
  from interm.ehf_icf_spine as a
    left join indic.ehf_us_indicators as b
    on a.pik = b.pik and a.year = b.year
  order by a.pik, a.year;
quit;

* CALCULATE UNEMPLOYMENT FROM NATIONAL INDICATORS FILE;
data interm.ehf_icf_spine;
  set interm.ehf_icf_spine;
  if [NIF num. states with earnings]_[Q1 earnings] = 0 or missing([NIF num. states with earnings]_[Q1 earnings]) then unemp1 = 1; else unemp1 = 0;
  if [NIF num. states with earnings]_[Q2 earnings] = 0 or missing([NIF num. states with earnings]_[Q2 earnings]) then unemp2 = 1; else unemp2 = 0;
  if [NIF num. states with earnings]_[Q3 earnings] = 0 or missing([NIF num. states with earnings]_[Q3 earnings]) then unemp3 = 1; else unemp3 = 0;
  if [NIF num. states with earnings]_[Q4 earnings] = 0 or missing([NIF num. states with earnings]_[Q4 earnings]) then unemp4 = 1; else unemp4 = 0;
  unemp_all = (unemp1 + unemp2 + unemp3 + unemp4)/4;
run;
data interm.ehf_icf_spine;
  set interm.ehf_icf_spine (drop = [NIF num. states with earnings]_[Q1 earnings] [NIF num. states with earnings]_[Q2 earnings] [NIF num. states with earnings]_[Q3 earnings] [NIF num. states with earnings]_[Q4 earnings] unemp1 unemp2 unemp3 unemp4);
  if missing([annual earnings]) then [annual earnings] = 0;
run;


* GET WORKER CHARACTERISTICS;
DATA tempdat.icf_controls;
  SET controls.icf_us (keep = [date of birth] [self reported gender] [self reported race] [self reported ethnicity] [educational attainment] pik);
  BY pik;
RUN;

* MERGE WORKER CHARACTERISTICS TO WORKER PIKS;
PROC SQL;
  CREATE TABLE interm.pikspine AS
  SELECT
    A.*,
    B.*
  FROM interm.pikspine AS a
    LEFT JOIN tempdat.icf_controls AS b 
    ON A.pik = B.pik;
QUIT;

* MERGE WORKER date of birth TO WORKER HISTORIES TO GET AGE;
PROC SQL;
  CREATE TABLE interm.ehf_icf_spine AS
  SELECT
    A.*,
    B.[date of birth], B.pik
  FROM interm.ehf_icf_spine AS a
    LEFT JOIN tempdat.icf_controls AS b 
    ON A.pik = B.pik;
QUIT;


PROC DATASETS library = tempdat nolist;
  DELETE icf_controls;
RUN; QUIT; 

* CONSTRUCT WORKER VARIABLES;
DATA interm.ehf_icf_spine;
  SET interm.ehf_icf_spine;
  yeardate = mdy(1,1,year);
  FORMAT yeardate [date of birth] date9.;
  age=yrdif ([date of birth],yeardate);
  y = [latitude]/1000000;
  x = [longitude]/1000000;
  DROP [latitude] [longitude];
run;

proc sort data = interm.ehf_icf_spine nodupkey;
    by pik year;
run; 

data interm.ehf_icf_spine;
  set interm.ehf_icf_spine (drop = [quality flag for lat/lon] [date of birth] yeardate);
  if age < 18 then delete;
  if age > 65 then delete;
run;

DATA interm.pikspine;
  SET interm.pikspine (drop = [date of birth]);
  IF [self reported gender] = '[female flag]' THEN female = 1;
    ELSE female = 0;
  IF [self reported ethnicity] = '[hispanic flag]' THEN hispanic = 1;
    ELSE hispanic = 0;
run;

DATA interm.pikspine;
  SET interm.pikspine;
  
  * create an array for dummies with 3 bytes of storage each;
  * {*} just counts the number of categories;
  * race skips category [] for some reason - so create empty;
  ARRAY dummys {*} 3. white black ind_nat asian haw_pi empty more_race;
  DO i = 1 to 7;
    dummys(i) = 0;
  END;
  dummys(race) = 1;

  ARRAY eddums {*} 3. nohigh highsch somecoll college;
  DO i = 1 to 4;
    eddums(i) = 0;
  END;
  eddums(educ_c) = 1;

  DROP empty i [self reported gender] [self reported race] [self reported ethnicity] [educational attainment] haw_pi more_race;
RUN;

proc sort data = interm.pikspine nodupkey;
    by pik;
run; 

data temp;
  set interm.ehf_icf_spine (keep = pik);
run;
proc sort data = temp nodupkey;
    by pik;
run; 
data interm.pikspine; 
  merge temp(in=innew) interm.pikspine(in=inold);
  by pik;
  if (inold=1 and innew=1) then output interm.pikspine;
run;


* Delete intermediate datasets;
proc datasets library = tempdat nolist;
  delete ehf_icf_sts;
run; quit;
proc datasets library = interm nolist;
  delete pikyearspine;
run; quit;
proc datasets library = work nolist;
  delete temp;
run; quit;
